﻿using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;

namespace Spider
{
    /// <summary>
    /// Spider main form. Downloads the album list pages entered in <c>txtUrl</c>,
    /// follows every album to collect its picture URLs, and hands the resulting
    /// SQL insert scripts to <c>SqlPrint.WriteLog</c>.
    /// </summary>
    public partial class Form1 : Form
    {
        // XPath of the <li> elements that each describe one album on a list page.
        private const string AlbumItemXPath = "//html[1]/body[1]/div[2]/div[6]/ul[1]/li";

        // XPaths evaluated against a single re-parsed <li> fragment.
        private const string AlbumLinkXPath = "//li[1]/div[1]/a";
        private const string AlbumImageXPath = "//li[1]/div[1]/a[1]/img";

        // XPath of the full-size picture on an album page.
        private const string PictureXPath = "//html[1]/body[1]/div[2]/div[3]/div[1]/div[2]/img";

        // Text shown by the site when an album page does not exist; marks the end of an album.
        private const string PageNotFoundMarker = "对不起，您访问的页面不存在";

        private const string HttpScheme = "http://";

        public Form1()
        {
            InitializeComponent();
        }

        // Article id counter: seeded from txtMaxArticleID on every click and
        // incremented once per album written by CreateSQL.
        static int MaxArticleID = 0;

        /// <summary>
        /// Click handler of the "get" button: validates the inputs, scrapes every
        /// list URL (one per line in <c>txtUrl</c>) and generates the SQL scripts.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        /// <remarks>
        /// NOTE(review): every download is synchronous and runs on the thread that
        /// raised the click event, so the form does not repaint while collecting.
        /// </remarks>
        private void btnGet_Click(object sender, EventArgs e)
        {
            txtResult.Text = "正在采集中，请稍后...";
            btnGet.Enabled = false;

            // try/finally guarantees the button is re-enabled on every exit path,
            // including validation failures and exceptions.
            try
            {
                string urls = txtUrl.Text;
                string catid = txtCatID.Text.Trim();
                string maxIdText = txtMaxArticleID.Text.Trim();
                if (string.IsNullOrEmpty(urls) || string.IsNullOrEmpty(catid) || string.IsNullOrEmpty(maxIdText))
                {
                    txtResult.Text = "采集地址或类别ID不能为空";
                    return;
                }

                // TryParse instead of Convert.ToInt32 so a typo is reported instead of thrown.
                int maxArticleId;
                if (!int.TryParse(maxIdText, out maxArticleId))
                {
                    txtResult.Text = "最大文章ID必须是整数";
                    return;
                }
                MaxArticleID = maxArticleId;

                // Drop blank lines (e.g. a trailing newline) so they are never downloaded as URLs.
                string[] arrUrls = urls.Split(new string[] { "\r\n", "\n" }, StringSplitOptions.RemoveEmptyEntries);

                // One client for the whole run, disposed deterministically.
                using (WebClient webClient = new WebClient())
                {
                    webClient.Encoding = System.Text.Encoding.GetEncoding("gb2312");

                    foreach (string rawUrl in arrUrls)
                    {
                        string listUrl = rawUrl.Trim();
                        if (listUrl.Length == 0)
                        {
                            continue;
                        }

                        List<Album> listAlbum = ParseAlbumList(webClient.DownloadString(listUrl));
                        foreach (Album album in listAlbum)
                        {
                            CollectAlbumPictures(webClient, album);
                        }

                        CreateSQL(listAlbum, catid);
                    }
                }

                txtResult.Text = "采集结束";
            }
            catch (WebException ex)
            {
                // Download failures are reported in the result box instead of escaping the handler.
                txtResult.Text = "采集失败：" + ex.Message;
            }
            finally
            {
                btnGet.Enabled = true;
            }
        }

        /// <summary>
        /// Extracts the albums (link, title and thumbnail) listed on one list page.
        /// </summary>
        /// <param name="html">HTML source of the list page.</param>
        /// <returns>The albums found; empty when the page has no matching items.</returns>
        private static List<Album> ParseAlbumList(string html)
        {
            List<Album> albums = new List<Album>();

            HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
            htmlDoc.LoadHtml(html);

            HtmlNodeCollection itemNodes = htmlDoc.DocumentNode.SelectNodes(AlbumItemXPath);
            if (itemNodes == null)
            {
                // SelectNodes yields null rather than an empty collection when nothing matches.
                return albums;
            }

            foreach (HtmlNode itemNode in itemNodes)
            {
                // The <li> is re-parsed as a standalone node, presumably so the
                // "//li[1]/..." paths only see this item - TODO confirm.
                HtmlNode fragment = HtmlNode.CreateNode(itemNode.OuterHtml);
                HtmlNode imageNode = fragment.SelectSingleNode(AlbumImageXPath);

                string href = GetAttributeOrNull(fragment.SelectSingleNode(AlbumLinkXPath), "href");
                string imageSrc = GetAttributeOrNull(imageNode, "src");
                string imageAlt = GetAttributeOrNull(imageNode, "alt");
                if (href == null || imageSrc == null || imageAlt == null)
                {
                    // Skip items whose markup does not match the expected layout.
                    continue;
                }

                Album album = new Album();
                album.AlbumHref = href;
                album.AlbumName = imageAlt;
                album.AlbumImgHref = imageSrc;
                albums.Add(album);
            }

            return albums;
        }

        /// <summary>
        /// Walks the pages of one album (first page = <c>AlbumHref</c>, later pages =
        /// <c>http://host/category/number_N.html</c>) and stores the picture URLs in
        /// <c>ListPic</c>. Albums whose link is not of the form
        /// <c>http://host/category/number.ext</c> are left untouched (ListPic stays unset).
        /// </summary>
        /// <param name="webClient">Client used for the downloads.</param>
        /// <param name="album">Album to complete; its cover is replaced by the first picture.</param>
        private static void CollectAlbumPictures(WebClient webClient, Album album)
        {
            string host, category, number;
            if (!TrySplitAlbumUrl(album.AlbumHref, out host, out category, out number))
            {
                return;
            }

            List<string> picList = new List<string>();
            string pageUrl = album.AlbumHref;

            for (int page = 1; ; page++)
            {
                if (page != 1)
                {
                    pageUrl = HttpScheme + host + "/" + category + "/" + number + "_" + page + ".html";
                }

                string pageHtml;
                if (!TryDownloadPage(webClient, pageUrl, out pageHtml))
                {
                    break;
                }

                HtmlAgilityPack.HtmlDocument pageDoc = new HtmlAgilityPack.HtmlDocument();
                pageDoc.LoadHtml(pageHtml);
                HtmlNode pageRoot = pageDoc.DocumentNode;
                if (pageRoot.InnerText.Contains(PageNotFoundMarker))
                {
                    break;
                }

                // A page without the expected picture also ends the album; otherwise
                // the loop would crash on a null collection.
                HtmlNodeCollection pictureNodes = pageRoot.SelectNodes(PictureXPath);
                if (pictureNodes == null || pictureNodes.Count == 0)
                {
                    break;
                }

                string src = GetAttributeOrNull(pictureNodes[0], "src");
                if (src == null)
                {
                    break;
                }

                string pic = HttpScheme + host + src;
                picList.Add(pic);
                if (page == 1)
                {
                    // Use the first picture as the cover; the original cover's resolution is too low.
                    album.AlbumImgHref = pic;
                }
            }

            album.ListPic = picList;
        }

        /// <summary>
        /// Downloads one album page, treating an HTTP 404 as "no such page".
        /// </summary>
        /// <param name="webClient">Client used for the download.</param>
        /// <param name="pageUrl">Absolute URL of the page.</param>
        /// <param name="html">Receives the page source, or null when the page does not exist.</param>
        /// <returns>True when the page was downloaded; false on HTTP 404.</returns>
        /// <exception cref="WebException">Any download failure other than HTTP 404.</exception>
        private static bool TryDownloadPage(WebClient webClient, string pageUrl, out string html)
        {
            try
            {
                html = webClient.DownloadString(pageUrl);
                return true;
            }
            catch (WebException ex)
            {
                HttpWebResponse response = ex.Response as HttpWebResponse;
                if (response == null || response.StatusCode != HttpStatusCode.NotFound)
                {
                    throw;
                }

                response.Close();
                html = null;
                return false;
            }
        }

        /// <summary>
        /// Splits an album link of the form <c>http://host/category/number.ext</c>
        /// into its three parts.
        /// </summary>
        /// <returns>False when the link has a different shape or an empty part.</returns>
        private static bool TrySplitAlbumUrl(string albumUrl, out string host, out string category, out string number)
        {
            host = "";
            category = "";
            number = "";

            // Guard the prefix explicitly instead of blindly cutting 7 characters,
            // which throws on short or relative links.
            if (string.IsNullOrEmpty(albumUrl) || !albumUrl.StartsWith(HttpScheme, StringComparison.OrdinalIgnoreCase))
            {
                return false;
            }

            string[] parts = albumUrl.Substring(HttpScheme.Length).Split('/');
            if (parts.Length != 3)
            {
                return false;
            }

            host = parts[0];
            category = parts[1];
            number = parts[2].Split('.')[0];
            return !(string.IsNullOrEmpty(host) || string.IsNullOrEmpty(category) || string.IsNullOrEmpty(number));
        }

        /// <summary>
        /// Returns the value of attribute <paramref name="name"/> on <paramref name="node"/>,
        /// or null when the node or the attribute is missing.
        /// </summary>
        private static string GetAttributeOrNull(HtmlNode node, string name)
        {
            if (node == null || node.Attributes[name] == null)
            {
                return null;
            }

            return node.Attributes[name].Value;
        }

        /// <summary>
        /// Escapes a value for use inside a single-quoted SQL string literal by
        /// doubling embedded single quotes. Null becomes the empty string.
        /// </summary>
        private static string EscapeSqlLiteral(string value)
        {
            return value == null ? "" : value.Replace("'", "''");
        }

        /// <summary>
        /// Builds the <c>pc_article</c> and <c>pc_attach</c> insert scripts for the
        /// albums that have a picture list and passes both scripts to
        /// <c>SqlPrint.WriteLog</c>, labelled with the text of <c>txtNum</c>.
        /// </summary>
        /// <param name="listAlbum">Albums to export; entries whose ListPic is null are skipped.</param>
        /// <param name="catid">Category id, emitted unquoted as the <c>cid</c> column value.</param>
        public void CreateSQL(List<Album> listAlbum, string catid)
        {
            // StringBuilder avoids re-copying the whole script on every appended row.
            StringBuilder albumSql = new StringBuilder();
            StringBuilder pictureSql = new StringBuilder();

            foreach (Album album in listAlbum)
            {
                if (album.ListPic == null)
                {
                    continue;
                }

                // The article insert does not set an id; the attachments reference
                // MaxArticleID instead. Presumably the table assigns ids sequentially
                // after the value entered in txtMaxArticleID - TODO confirm.
                MaxArticleID++;

                // Scraped text is escaped so a quote in a title or URL cannot break the script.
                albumSql.Append("insert into pc_article (cid,title,cover,author,picfrom) values (")
                    .Append(catid)
                    .Append(",'")
                    .Append(EscapeSqlLiteral(album.AlbumName))
                    .Append("','")
                    .Append(EscapeSqlLiteral(album.AlbumImgHref))
                    .Append("','Spider','1');  \r\n");

                foreach (string pic in album.ListPic)
                {
                    string escapedPic = EscapeSqlLiteral(pic);
                    pictureSql.Append("insert into pc_attach (article_id,uid,name,file,picfrom) values ('")
                        .Append(MaxArticleID)
                        .Append("','1','")
                        .Append(escapedPic)
                        .Append("','")
                        .Append(escapedPic)
                        .Append("','1');  \r\n");
                }
            }

            string num = txtNum.Text.Trim();
            SqlPrint.WriteLog(albumSql.ToString(), "专辑" + num);
            SqlPrint.WriteLog(pictureSql.ToString(), "图片" + num);
        }
    }
}
